Introduction

Description of the Dataset:

The dataset can be found at https://www.kaggle.com/avikasliwal/used-cars-price-prediction#train-data.csv

It has following parameters:

Name: The brand and model of the car.
Location: The location in which the car is being sold or is available for purchase.
Year: The year or edition of the model.
Kilometers_Driven: The total kilometres driven in the car by the previous owner(s) in KM.
Fuel_Type: The type of fuel used by the car. (Petrol / Diesel / Electric / CNG / LPG)
Transmission: The type of transmission used by the car. (Automatic / Manual)
Owner_Type: Whether the ownership is Firsthand, Second hand or other.
Mileage: The standard mileage offered by the car company in kmpl or km/kg
Engine: The displacement volume of the engine in cc.
Power: The maximum power of the engine in bhp.
Seats: The number of seats in the car.
New_Price: The price of a new car of the same model.
Price: The price of the used car in INR Lakhs.

The price prediction for used cars is a very important part of the used car business. Predicting prices accurately can help businesses and customers settle a deal at a fair price. Vehicle price prediction, especially when the vehicle is used and not coming directly from the factory, is both a critical and important task. With an increase in demand for used cars and up to an 8 percent decrease in demand for new cars in 2013, more and more vehicle buyers are finding alternatives to buying new cars outright.

Methods

#reading data from the csv
library(readr)
train_data = read_csv("./train-data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   Name = col_character(),
##   Location = col_character(),
##   Year = col_double(),
##   Kilometers_Driven = col_double(),
##   Fuel_Type = col_character(),
##   Transmission = col_character(),
##   Owner_Type = col_character(),
##   Mileage = col_character(),
##   Engine = col_character(),
##   Power = col_character(),
##   Seats = col_double(),
##   New_Price = col_character(),
##   Price = col_double()
## )
test_data = read_csv("./test-data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   Name = col_character(),
##   Location = col_character(),
##   Year = col_double(),
##   Kilometers_Driven = col_double(),
##   Fuel_Type = col_character(),
##   Transmission = col_character(),
##   Owner_Type = col_character(),
##   Mileage = col_character(),
##   Engine = col_character(),
##   Power = col_character(),
##   Seats = col_double(),
##   New_Price = col_character()
## )

Cleaning the dataset

The first thing we did was to clean the dataset that we are given. In the dataset, the values for Power, Mileage and Engine are added as strings with their units along with them. So we had to remove those units and convert their types to numeric.

Variables like Location,Transmission, Fuel_Type and Owner_Type can be considered as factor variables therefore they were changed to factor variables.

Columns for X1 and Name were removed as they were not really affecting the price.

#clean data function
#
# Cleans a raw used-car data frame as read from train-data.csv / test-data.csv:
#  - drops the New_Price column (sparse; not used for modelling)
#  - strips unit suffixes (" kmpl", " CC", " bhp") from Power, Mileage and
#    Engine and converts them to numeric
#  - converts the categorical columns to factors
#  - drops rows with any NA and rows with a zero Mileage
#  - drops the X1 (row id) and Name columns
# Returns the cleaned data frame.
clean_data = function(data) {
  # Remove the New_Price column by name rather than by position (13), so the
  # function is robust to column reordering between the train and test files.
  data = data[, setdiff(names(data), "New_Price")]

  # Strip a trailing " <unit>" suffix (e.g. "23.4 kmpl" -> 23.4). sub() is
  # vectorized, so no lapply/unlist wrapper is needed. Non-numeric entries
  # such as "null bhp" become NA (with a coercion warning) and are removed
  # by na.omit below.
  remove_units = function(values) {
    as.numeric(sub("\\s+\\D+$", "", values))
  }

  data$Location = factor(data$Location)
  data$Transmission = factor(data$Transmission)
  data$Fuel_Type = factor(data$Fuel_Type)
  data$Owner_Type = factor(data$Owner_Type)
  data$Power = remove_units(data$Power)
  data$Mileage = remove_units(data$Mileage)
  data$Engine = remove_units(data$Engine)
  data = na.omit(data)

  # Drop rows with a recorded mileage of zero (data-entry errors).
  # NOTE: the original `data[-which(data$Mileage == 0), ]` silently returned
  # ZERO rows whenever no Mileage was 0, because `-integer(0)` selects
  # nothing. A logical mask has no such edge case.
  data = data[data$Mileage != 0, ]

  # Removing the X1 (row id) and Name columns, by name for robustness.
  data = data[, setdiff(names(data), c("X1", "Name"))]

  return(data)
}

# Apply the cleaning routine to the freshly read train and test data.
train_data = clean_data(train_data)
test_data = clean_data(test_data)

Splitting the dataset

We split our train_data into trn_data and tst_data. The test_data provided in the file with the dataset is for the competition and does not include the Price parameter.

## set the seed to make your partition reproducible
set.seed(42)

# 75/25 split of the cleaned training data into fitting and evaluation sets.
n_obs = nrow(train_data)
smp_size = floor(0.75 * n_obs)
train_ind = sample(seq_len(n_obs), size = smp_size)

trn_data = train_data[train_ind, ]
tst_data = train_data[-train_ind, ]

head(trn_data)
## # A tibble: 6 x 11
##   Location  Year Kilometers_Driv… Fuel_Type Transmission Owner_Type Mileage
##   <fct>    <dbl>            <dbl> <fct>     <fct>        <fct>        <dbl>
## 1 Coimbat…  2017            24806 Petrol    Manual       First         18.2
## 2 Coimbat…  2016            44340 Diesel    Automatic    First         17.0
## 3 Delhi     2012            68038 Diesel    Manual       First         20.1
## 4 Delhi     2016            22000 Diesel    Manual       First         24.3
## 5 Ahmedab…  2011            76000 CNG       Manual       First         26.3
## 6 Mumbai    2015            29631 Petrol    Automatic    First         18.9
## # … with 4 more variables: Engine <dbl>, Power <dbl>, Seats <dbl>,
## #   Price <dbl>

Correlation between parameters

cor(trn_data[c("Kilometers_Driven","Mileage","Engine","Power")])
##                   Kilometers_Driven     Mileage      Engine       Power
## Kilometers_Driven        1.00000000 -0.05500041  0.08971581  0.04084805
## Mileage                 -0.05500041  1.00000000 -0.64849776 -0.56117361
## Engine                   0.08971581 -0.64849776  1.00000000  0.87573788
## Power                    0.04084805 -0.56117361  0.87573788  1.00000000

We can see from the correlation matrix that Power and Engine are highly correlated, and Engine and Mileage are also correlated to some extent. We will have to be careful before using all of them in our model.

Plotting relationships between the Price and other paramters

# Scatter plots of Price against each numeric predictor, on the original and
# on the log scale, to judge which transformation is better suited for
# linear regression. (Several titles below were corrected: they previously
# claimed log scales or variables that were not actually plotted.)
plot(Price ~ Power, data = trn_data, 
     col = "dodgerblue",
     main = "Price and Power relation")

plot(log(Price) ~ Power, data = trn_data, 
     col = "dodgerblue",
     main = "log(Price) and Power relation")

plot(log(Price) ~ log(Power), data = trn_data, 
     col = "dodgerblue",
     main = "log(Price) and log(Power) relation")

plot(Price ~ Mileage, data = trn_data, col = "dodgerblue",
     main = "Price and Mileage relation")

plot(log(Price) ~ log(Mileage), data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Mileage) relation")

plot(Price ~ Engine, data = trn_data, col = "dodgerblue",
     main = "Price and Engine relation")

plot(log(Price) ~ log(Engine), data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Engine) relation")

plot(Price ~ Kilometers_Driven, data = trn_data, col = "dodgerblue",
     main = "Price and Kilometers_Driven relation")

plot(log(Price) ~ log(Kilometers_Driven), data = trn_data, col = "dodgerblue",
     main = "log(Price) and log(Kilometers_Driven) relation")

From these plots we inferred that the logs of the above parameters are better suited for regression.

Functions to check Assumptions

plot_fit_res = function(model, title = "") {
  # Residuals-vs-fitted diagnostic plot (checks the equal-variance assumption).
  res = resid(model)
  plot(fitted(model), res,
       col = "grey", pch = 20,
       xlab = "Fitted", ylab = "Residuals", main = title)
  # Horizontal reference line at zero residual.
  abline(h = 0, col = "darkorange", lwd = 2)
}
plot_qq = function(model, title = "") {
  # Normal Q-Q plot of the residuals (checks the normality assumption).
  model_resid = resid(model)
  qqnorm(model_resid, main = title, col = "darkgrey")
  qqline(model_resid, col = "dodgerblue", lwd = 2)
}
plot_fit_pred = function(model, data, title = "") {
  # Predicted vs actual plot. Assumes `model` predicts log(Price), so the
  # actual values are log-transformed to be on the same scale; points near
  # the y = x line indicate accurate predictions.
  y_hat = predict(model, newdata = data)
  y_actual = log(data$Price)
  plot(y_hat, y_actual, col = "grey", pch = 20,
       xlab = "Predicted", ylab = "Actual", main = title)
  abline(a = 0, b = 1, col = "orange", lwd = 3)
}

Functions to compare different models

calc_aic = function(model) {
  # AIC computed from the residual sum of squares:
  #   n * log(RSS / n) + 2 * p
  # (the form step() reports for linear models, up to an additive constant).
  res = resid(model)
  n = length(res)
  p = length(model$coefficients)
  n * log(sum(res^2) / n) + 2 * p
}
calc_bic = function(model) {
  # BIC computed from the residual sum of squares:
  #   n * log(RSS / n) + log(n) * p
  res = resid(model)
  n = length(res)
  p = length(model$coefficients)
  n * log(sum(res^2) / n) + log(n) * p
}
calc_loocv_rmse = function(model) {
  # Leave-one-out CV RMSE via the hat-matrix shortcut: e_i / (1 - h_ii)
  # is the i-th leave-one-out residual, so no refitting is required.
  loo_resid = resid(model) / (1 - hatvalues(model))
  sqrt(mean(loo_resid^2))
}
calc_adj_r2 = function(model) {
  # Adjusted R^2 as reported by summary.lm().
  summary(model)$adj.r.squared
}
calc_rmse = function(model, data) {
  # RMSE on the original price scale for a model fitted to log(Price):
  # predictions are exponentiated before comparing with the actual Price.
  predicted_price = exp(predict(model, newdata = data))
  sqrt(mean((predicted_price - data$Price)^2))
}
calc_quality_criterions = function(model) {
  # Collect all model-selection criteria into a one-row data frame.
  data.frame(
    "AIC" = calc_aic(model),
    "BIC" = calc_bic(model),
    "LOOCV_RMSE" = calc_loocv_rmse(model),
    "ADJ_R_SQ" = calc_adj_r2(model)
  )
}

Creating and testing different models

The first model we created was an additive model using all the parameters.

model1 = lm(Price ~ ., data = trn_data)
calc_quality_criterions(model1)
##     AIC      BIC LOOCV_RMSE  ADJ_R_SQ
## 1 15951 16104.25   7.088395 0.7023269
plot_fit_res(model1, title = "Residuals vs fitted plot for Model1")

plot_qq(model1, title = "QQ plot for Model1")

As we have already seen from the plots that it is better to use the log for numeric parameters we also created the model using the logs for Power, Mileage, Engine and Kilometers driven.

model2 = lm(log(Price) ~ Location+Year+log(Kilometers_Driven)+Fuel_Type+Transmission+Owner_Type+log(Mileage)+log(Engine)+log(Power)+Seats, data = trn_data)
calc_quality_criterions(model2)
##         AIC       BIC LOOCV_RMSE  ADJ_R_SQ
## 1 -10990.32 -10837.07  0.2858198 0.8895492
plot_fit_res(model2, title = "Residuals vs fitted plot for Model2")

plot_qq(model2, title = "QQ plot for Model2")

We were able to improve all the criteria by just using the log transformation: AIC, BIC and LOOCV_RMSE decreased and ADJ_R_SQ increased.

We tried to further decrease the AIC and BIC by using the Backward search on the model2.

model3 = step(model2, direction = "backward")
## Start:  AIC=-10990.32
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Engine) + 
##     log(Power) + Seats
## 
##                          Df Sum of Sq    RSS      AIC
## - Seats                   1     0.066 353.27 -10991.5
## <none>                                353.20 -10990.3
## - log(Engine)             1     1.331 354.54 -10975.8
## - log(Mileage)            1     2.673 355.88 -10959.3
## - Owner_Type              3     4.607 357.81 -10939.5
## - log(Kilometers_Driven)  1    12.203 365.41 -10843.5
## - Fuel_Type               3    32.572 385.78 -10609.7
## - Transmission            1    39.314 392.52 -10529.8
## - Location               10    52.305 405.51 -10405.0
## - log(Power)              1   113.459 466.66  -9771.4
## - Year                    1   237.892 591.10  -8735.4
## 
## Step:  AIC=-10991.5
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Engine) + 
##     log(Power)
## 
##                          Df Sum of Sq    RSS      AIC
## <none>                                353.27 -10991.5
## - log(Engine)             1     1.268 354.54 -10977.8
## - log(Mileage)            1     2.680 355.95 -10960.4
## - Owner_Type              3     4.607 357.88 -10940.7
## - log(Kilometers_Driven)  1    12.537 365.81 -10840.7
## - Fuel_Type               3    33.162 386.43 -10604.3
## - Transmission            1    41.473 394.74 -10507.0
## - Location               10    52.407 405.68 -10405.2
## - log(Power)              1   129.739 483.01  -9622.5
## - Year                    1   251.587 604.86  -8636.5
n = length(resid(model2))
model4 = step(model2, direction = "backward", k = log(n))
## Start:  AIC=-10837.07
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Engine) + 
##     log(Power) + Seats
## 
##                          Df Sum of Sq    RSS      AIC
## - Seats                   1     0.066 353.27 -10844.6
## <none>                                353.20 -10837.1
## - log(Engine)             1     1.331 354.54 -10829.0
## - log(Mileage)            1     2.673 355.88 -10812.4
## - Owner_Type              3     4.607 357.81 -10805.4
## - log(Kilometers_Driven)  1    12.203 365.41 -10696.6
## - Fuel_Type               3    32.572 385.78 -10475.6
## - Transmission            1    39.314 392.52 -10382.9
## - Location               10    52.305 405.51 -10315.6
## - log(Power)              1   113.459 466.66  -9624.5
## - Year                    1   237.892 591.10  -8588.5
## 
## Step:  AIC=-10844.64
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Engine) + 
##     log(Power)
## 
##                          Df Sum of Sq    RSS    AIC
## <none>                                353.27 -10845
## - log(Engine)             1     1.268 354.54 -10837
## - log(Mileage)            1     2.680 355.95 -10820
## - Owner_Type              3     4.607 357.88 -10813
## - log(Kilometers_Driven)  1    12.537 365.81 -10700
## - Fuel_Type               3    33.162 386.43 -10476
## - Transmission            1    41.473 394.74 -10366
## - Location               10    52.407 405.68 -10322
## - log(Power)              1   129.739 483.01  -9482
## - Year                    1   251.587 604.86  -8496
calc_quality_criterions(model4)
##        AIC       BIC LOOCV_RMSE  ADJ_R_SQ
## 1 -10991.5 -10844.64   0.285739 0.8895538

Using model4 we were able to reduce the LOOCV_RMSE and slightly increase the ADJ_R_SQ, with minimal effect on AIC.

From the correlation testing earlier we had found that Engine and Power are highly correlated. This means we should also try removing the Engine parameter and see if it improves anything.

model5 = lm(log(Price) ~ Location+Year+log(Kilometers_Driven)+Fuel_Type+Transmission+Owner_Type+log(Mileage)+log(Power)+Seats, data = trn_data)
model5 = step(model5, direction = "backward", k = log(n))
## Start:  AIC=-10828.98
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Power) + Seats
## 
##                          Df Sum of Sq    RSS      AIC
## - Seats                   1      0.00 354.54 -10837.3
## <none>                                354.54 -10829.0
## - Owner_Type              3      4.56 359.10 -10798.1
## - log(Mileage)            1      6.34 360.88 -10759.7
## - log(Kilometers_Driven)  1     12.09 366.63 -10690.3
## - Transmission            1     38.38 392.91 -10386.9
## - Location               10     52.14 406.67 -10311.5
## - Fuel_Type               3     56.52 411.06 -10205.8
## - Year                    1    236.60 591.14  -8596.6
## - log(Power)              1    343.14 697.68  -7870.3
## 
## Step:  AIC=-10837.32
## log(Price) ~ Location + Year + log(Kilometers_Driven) + Fuel_Type + 
##     Transmission + Owner_Type + log(Mileage) + log(Power)
## 
##                          Df Sum of Sq    RSS      AIC
## <none>                                354.54 -10837.3
## - Owner_Type              3      4.56 359.10 -10806.5
## - log(Mileage)            1      9.04 363.58 -10735.3
## - log(Kilometers_Driven)  1     12.21 366.75 -10697.3
## - Transmission            1     40.25 394.79 -10374.4
## - Location               10     52.15 406.69 -10319.7
## - Fuel_Type               3     68.24 422.77 -10091.0
## - Year                    1    251.81 606.35  -8493.6
## - log(Power)              1    357.65 712.19  -7788.5
calc_quality_criterions(model5)
##        AIC       BIC LOOCV_RMSE  ADJ_R_SQ
## 1 -10977.8 -10837.32   0.286194 0.8891829
plot_fit_res(model5, title = "Residuals vs fitted plot for Model5")

plot_qq(model5, title = "QQ plot for Model5")

plot_fit_pred(model5, trn_data, title = "Predicted vs Actual for Model5")

We then also tried some interaction models

model6 = lm(log(Price) ~ (Location+Year+log(Kilometers_Driven)+Fuel_Type+Transmission+Owner_Type+log(Mileage)+log(Engine)+log(Power)+Seats)^2, data = trn_data)
model7 = step(model6, direction = "backward", trace = FALSE)
calc_quality_criterions(model7)
##        AIC       BIC LOOCV_RMSE  ADJ_R_SQ
## 1 -11908.8 -11346.88        Inf 0.9117736
plot_fit_res(model7, title = "Residuals vs fitted plot for Model7")

plot_qq(model7, title = "QQ plot for Model7")

plot_fit_pred(model7, trn_data, title = "Predicted vs Actual for Model7")
## Warning in predict.lm(model, newdata = data): prediction from a rank-
## deficient fit may be misleading

Using model7 we were further able to decrease the AIC and BIC compared to the additive models. Note, however, that its LOOCV_RMSE is Inf: some observations have a hat value of 1, so the leave-one-out shortcut formula (which divides residuals by 1 minus the hat value) breaks down for this model.

Results

We were able to find 2 models that were giving us very good results. One is additive and another is an interaction model.

Additive Model

model_add = model5
calc_quality_criterions(model_add)
##        AIC       BIC LOOCV_RMSE  ADJ_R_SQ
## 1 -10977.8 -10837.32   0.286194 0.8891829
plot_fit_res(model_add, title = "Residuals vs fitted plot for Additive model")

plot_qq(model_add, title = "QQ plot for Additive model")

plot_fit_pred(model_add, trn_data, title = "Predicted vs Actual for Additive Model For train data")

plot_fit_pred(model_add, tst_data, title = "Predicted vs Actual for Additive Model For test data")

calc_rmse(model_add, tst_data)
## [1] 4.387601

Interaction Model

model_int = model7
calc_quality_criterions(model_int)
##        AIC       BIC LOOCV_RMSE  ADJ_R_SQ
## 1 -11908.8 -11346.88        Inf 0.9117736
plot_fit_res(model_int, title = "Residuals vs fitted plot for Interaction model")

plot_qq(model_int, title = "QQ plot for Interaction model")

plot_fit_pred(model_int, trn_data, title = "Predicted vs Actual for Interaction model for train data")

plot_fit_pred(model_add, tst_data, title = "Predicted vs Actual for Interaction Model For test data")

calc_rmse(model_int, tst_data)
## [1] 3.627951

Discussion

In the context of the data, both models use log(Price) as the response, which as shown earlier gives a better result. Parameters like Name and ID were removed from both models. The numerical variables Kilometers_Driven, Engine, Power and Mileage were shown to work better when log-transformed. Many dummy variables were used to accommodate the factor variables such as Owner_Type, Transmission and Location.

The AIC and BIC values for both models are low, which is good. The LOOCV_RMSE values are also low. The adjusted R-squared values are close to 1. The residual vs fitted plots show that both models follow the equal-variance assumption to some extent, and the QQ plots show that the models also follow the normality assumption.

The Interaction model has a lower RMSE than the Additive model, which means that the interaction model gives better results than the Additive model.

Appendix

library(readr)

#reading data from the csv
# read_csv fills in the unnamed first column as "X1" (row ids).
train_data = read_csv("./train-data.csv")
test_data = read_csv("./test-data.csv")

#clean data function
#
# Cleans a raw used-car data frame as read from the CSVs: drops New_Price,
# strips unit suffixes from Power/Mileage/Engine, converts categoricals to
# factors, removes NA and zero-Mileage rows, and drops the X1/Name columns.
clean_data = function(data) {
  # Remove the New_Price column by name rather than by position (13), so the
  # function is robust to column reordering between the train and test files.
  data = data[, setdiff(names(data), "New_Price")]
  # Strip a trailing " <unit>" suffix (e.g. "23.4 kmpl" -> 23.4). sub() is
  # vectorized; non-numeric entries become NA and are removed by na.omit.
  remove_units = function(values) {
    as.numeric(sub("\\s+\\D+$", "", values))
  }
  data$Location = factor(data$Location)
  data$Transmission = factor(data$Transmission)
  data$Fuel_Type = factor(data$Fuel_Type)
  data$Owner_Type = factor(data$Owner_Type)
  data$Power = remove_units(data$Power)
  data$Mileage = remove_units(data$Mileage)
  data$Engine = remove_units(data$Engine)
  data = na.omit(data)
  # Logical mask instead of `-which(...)`: the latter returned ZERO rows
  # whenever no Mileage was 0 (negative indexing with integer(0)).
  data = data[data$Mileage != 0, ]
  # Drop the X1 (row id) and Name columns, by name for robustness.
  data = data[, setdiff(names(data), c("X1", "Name"))]
  return(data)
}

loocv_rmse = function(model) {
  # Leave-one-out CV RMSE using the hat-values shortcut (no refitting).
  press_resid = resid(model) / (1 - hatvalues(model))
  sqrt(mean(press_resid^2))
}

# Clean both datasets (run on freshly read data only: clean_data drops
# columns by position in this appendix version and is not idempotent).
train_data = clean_data(train_data)
test_data = clean_data(test_data)

# Pairwise scatter plots to eyeball collinearity among numeric predictors.
pairs(train_data[c("Power","Mileage","Engine","Price")])

plot(log(Price) ~ log(Power), data = train_data)
# Working copy used for transformed-Mileage exploration below.
train_data_nm = train_data
mileage_transform = function(x) {
  # Negate Mileage so its (negative) relationship with log(Price) is
  # plotted as an upward trend.
  -x
}
plot(log(Price) ~ mileage_transform(Mileage), data = train_data_nm)

plot(log(Price) ~ log(Engine), data = train_data)

# NOTE(review): the first assignment below is immediately overwritten by
# the second and has no effect — candidate for removal.
train_data_nm = train_data
# Drop the single row with the largest Kilometers_Driven (extreme outlier)
# before plotting.
train_data_nm = train_data[-which.max(train_data$Kilometers_Driven),]
plot(log(Price) ~ log(Kilometers_Driven), data = train_data_nm)

# NOTE(review): train_data was already cleaned above (X1 and Name removed),
# so this drops the FIRST TWO remaining columns — Location and Year — from
# the modelling data. Confirm this is intended; the Methods section keeps
# both predictors.
train_data = train_data[,-c(1,2)]
# Additive model on the untransformed response, then backward AIC search.
model = lm(Price ~ ., data =  train_data)
model_aic = step(model, direction = "backward")
anova(model_aic, model)

# Same search with log(Price) as the response.
log_model = lm(log(Price) ~ .,data =  train_data)
log_model_aic = step(log_model, direction = "backward")

# Compare LOOCV RMSE and in-sample RMSE of the candidate models.
loocv_rmse(log_model)
loocv_rmse(log_model_aic)
sqrt(mean(resid(log_model_aic)^2))

sqrt(mean(resid(model_aic)^2))
loocv_rmse(model_aic)


# Two-way interaction model on the log response, reduced by backward AIC.
log_model_int = lm(log(Price) ~ (.)^2, data = train_data)
log_model_int_aic = step(log_model_int, direction = "backward")
loocv_rmse(log_model_int_aic)
anova(log_model_aic, log_model_int_aic)
sqrt(mean(resid(log_model_int_aic)^2))

price_test_hat = exp(predict(log_model_int_aic, newdata = test_data))

# clean_data() must only be applied to freshly read data: it drops columns
# by position, so re-cleaning the already-cleaned (and column-stripped)
# frames above fails. Re-read the raw CSVs before preparing the
# train/evaluation split below.
train_data = read_csv("./train-data.csv")
test_data = read_csv("./test-data.csv")
train_data = clean_data(train_data)
test_data = clean_data(test_data)

smp_size = floor(0.75 * nrow(train_data))

## set the seed to make your partition reproducible
set.seed(42)
train_ind = sample(seq_len(nrow(train_data)), size = smp_size)

trn_data = train_data[train_ind, ]
tst_data = train_data[-train_ind, ]

plot_fit_res = function(model, title = "") {
  # Residuals-vs-fitted diagnostic plot (equal-variance assumption).
  res = resid(model)
  plot(fitted(model), res,
       col = "grey", pch = 20,
       xlab = "Fitted", ylab = "Residuals", main = title)
  abline(h = 0, col = "darkorange", lwd = 2)
}
plot_qq = function(model, title = "") {
  # Normal Q-Q plot of the residuals (normality assumption).
  model_resid = resid(model)
  qqnorm(model_resid, main = title, col = "darkgrey")
  qqline(model_resid, col = "dodgerblue", lwd = 2)
}
plot_fit_pred = function(model, data, title = "") {
  # Predicted vs actual plot; assumes `model` predicts log(Price), so the
  # actuals are log-transformed to match scales.
  y_hat = predict(model, newdata = data)
  y_actual = log(data$Price)
  plot(y_hat, y_actual, col = "grey", pch = 20,
       xlab = "Predicted", ylab = "Actual", main = title)
  abline(a = 0, b = 1, col = "orange", lwd = 3)
}
calc_aic = function(model) {
  # AIC from the residual sum of squares: n * log(RSS / n) + 2 * p.
  res = resid(model)
  n = length(res)
  p = length(model$coefficients)
  n * log(sum(res^2) / n) + 2 * p
}
calc_bic = function(model) {
  # BIC from the residual sum of squares: n * log(RSS / n) + log(n) * p.
  res = resid(model)
  n = length(res)
  p = length(model$coefficients)
  n * log(sum(res^2) / n) + log(n) * p
}
calc_loocv_rmse = function(model) {
  # Leave-one-out CV RMSE via the hat-matrix shortcut.
  loo_resid = resid(model) / (1 - hatvalues(model))
  sqrt(mean(loo_resid^2))
}
calc_adj_r2 = function(model) {
  # Adjusted R^2 as reported by summary.lm().
  summary(model)$adj.r.squared
}
calc_rmse = function(model, data) {
  # RMSE on the original price scale for a model fitted to log(Price).
  predicted_price = exp(predict(model, newdata = data))
  sqrt(mean((predicted_price - data$Price)^2))
}
calc_quality_criterions = function(model) {
  # Collect all model-selection criteria into a one-row data frame.
  data.frame(
    "AIC" = calc_aic(model),
    "BIC" = calc_bic(model),
    "LOOCV_RMSE" = calc_loocv_rmse(model),
    "ADJ_R_SQ" = calc_adj_r2(model)
  )
}